This file contains the scripts used for a contrastive analysis of English and Norwegian future constructions.
Load packages
library(tidyverse)
library(readxl)
library(party)
library(lattice)
library(Hmisc)
library(pdp)
library(collostructions) # available at sfla.ch
library(patchwork)
library(ggparty)
library(Boruta)
library(svglite)The English data were queried via CWB. SpokenBNC2014 was transformed to CWB format using this script. OANC was transformed to CWB using this script. Uncollapse the following code box to see the queries.
# Spoken BNC 2014
cqp -e
SPOKENBNC2014
[word="will|shall|going|wo|'?ll|gon"][word="to|n't|na"]?
cat Last > "will_shall_going_to_SPOKENBNC2014.txt"
# in addition, for taking priming effects into account:
cqp -e
SPOKENBNC2014;
set context 150 words;
show +u_who;
set ld "üüü";
set rd "üüü";
[word="will|shall|going|wo|'?ll|gon"][word="to|n't|na"]?;
cat Last > will_shall_going_to_SPOKENBNC2014_more_context.txt
In the text file, replace üüü by \t.
# OANC
cqp -e
OANCSPOKEN
set context s
show +lemma +pos
set PrintStructures "text_id, text_genre, text_file, turn_id, turn_age, turn_sex"
A = [word="will|shall|going|wo|ll|gon|gonna"][word="to|n't|na"]?
cat A > "oanc_spoken.txt"
# additional query for 'll files because the apostrophe
# was missing in the original query
cqp -e
OANCSPOKEN
set context s
show +lemma +pos
set PrintStructures "text_id, text_genre, text_file, turn_id, turn_age, turn_sex"
A = [word="'ll"]
cat A > "oanc_ll.txt"# read data: Norwegian
nor_bb <- read_xlsx("data/Norwegian/nota_bb_vil_skal_komer_til_a.xlsx", sheet = "bigbrother")
nor_nota <- read_xlsx("data/Norwegian/nota_bb_vil_skal_komer_til_a.xlsx", sheet = "nota")
nor <- rbind(nor_nota, nor_bb)
# clean column names
colnames(nor) <- gsub(" ", "_", colnames(nor))
# read data: English
oanc <- read_csv("data/English/OANC/oanc_spoken_going_to_shall_will_5000.csv")
bnc <- read_xlsx("data/English/SpokenBNC2014/SPOKENBNC2014_spoken_will_shall_going_to_sample_5000-final-priming.xlsx")Protasis/apodosis annotation columns are added automatically.
# add protasis/apodosis annotation to BNC
bnc$if_clause2 <- ifelse(bnc$if_clause=="if" & bnc$subordinate=="sub", "protasis", NA)
bnc$if_clause2 <- ifelse(bnc$if_clause=="if" & bnc$subordinate=="main", "apodosis", bnc$if_clause2)
bnc$if_clause2 <- ifelse(bnc$if_clause!="if", "no", bnc$if_clause2)
# add protasis/apodosis annotation to OANC
oanc$if_clause2 <- ifelse(oanc$if_clause=="if" & oanc$subordinate=="sub", "protasis", NA)
oanc$if_clause2 <- ifelse(oanc$if_clause=="if" & oanc$subordinate=="main", "apodosis", oanc$if_clause2)
oanc$if_clause2 <- ifelse(oanc$if_clause!="if", "no", oanc$if_clause2)
# add protasis/apodosis annotation to NOR
nor$if_clause2 <- ifelse(nor$`If-clause`=="if" & nor$Clause_type=="sub", "protasis", "NA")
nor$if_clause2 <- ifelse(nor$`If-clause`=="if" & nor$Clause_type=="main", "apodosis", nor$if_clause2)
nor$if_clause2 <- ifelse(nor$`If-clause`!="if", "no", nor$if_clause2)Also, we add columns with the outcome variable to the English data, with an additional column with a binary encoding (will / going to)
# OANC ---------
# binary:
oanc$cxn <- ifelse(oanc$Key %in% c("gonna", "going"), "going_to", "will")
# more fine-grained:
oanc$cxn1 <- case_when(oanc$Key == "will" ~ "will",
oanc$Key == "'ll" ~ "'ll",
oanc$Key == "shall" ~ "shall",
oanc$Key == "wo" ~ "won't",
oanc$Key == "going" ~ "going to",
oanc$Key == "gonna" ~ "gonna")
oanc$cxn1 <- factor(oanc$cxn1, levels = c("will", "'ll", "shall", "won't", "going to", "gonna"))
# BNC ---------
# binary
bnc$cxn <- ifelse(bnc$Key %in% c("'ll", "will", "wo"), "will", "going to")
bnc$cxn1 <- case_when(bnc$Key == "gon" ~ "gonna",
bnc$Key == "'ll" ~ "'ll",
bnc$Key == "going" ~ "going to",
bnc$Key == "will" ~ "will",
bnc$Key == "wo" ~ "won't",
bnc$Key == "shall" ~ "shall")
bnc$cxn1 <- factor(bnc$cxn1, levels = c("will", "'ll", "shall", "won't", "going to", "gonna"))Omit false hits:
Number of hits per corpus:
# Norwegian
left_join(setNames(as.data.frame(table(nor_nota$Cx)), c("Cxn", "NoTa")),
setNames(as.data.frame(table(nor_bb$Cx)), c("Cxn", "BB")))# English
left_join(setNames(as.data.frame(table(bnc$cxn1)), c("Cxn", "BNC")),
setNames(as.data.frame(table(oanc$cxn1)), c("Cxn", "OANC")))Fitting a conditional inference tree and a random forest to the Norwegian data:
# Norwegian: ---------------
# relevant variables as factors
nor$Cx <- factor(nor$Cx)
nor$Negative <- factor(nor$Negative)
nor$Interrogative <- factor(nor$Interrogative)
nor$if_clause2 <- factor(nor$if_clause2)
nor$Clause_type <- factor(nor$Clause_type)
# reduce number of levels for Lexeme
nor$Lexeme <- factor(nor$Lexeme)
nor$lexeme <- fct_lump_min(nor$Lexeme, min = 50, other_level = "other")
# more descriptive name
nor$Construction <- nor$Cx
# shorter cx labels (obsolete now that we're using ggparty)
nor$cxn <- case_when(nor$Cx == "kommer" ~ "k",
nor$Cx == "skal" ~ "s",
nor$Cx == "vil" ~ "v")
nor$cxn <- factor(nor$cxn)
# shorter if clause label
nor$if_clause <- nor$if_clause2
# CART tree
set.seed(19851003)
tr_nor <- ctree(Construction ~ Negative+Interrogative+if_clause+Clause_type,
data = nor)
# plot - adapted from https://ladal.edu.au/tree.html
# extract p-values
pvals <- unlist(nodeapply(tr_nor, ids = nodeids(tr_nor), function(n) info_node(n)$p.value))
pvals <- pvals[pvals <.05]
( tr_nor_plot <- ggparty(tr_nor) +
geom_edge() +
geom_edge_label() +
geom_node_label(line_list = list(aes(label = splitvar),
aes(label = paste0("N=", nodesize, ", p",
ifelse(pvals < .001, "<.001", paste0("=", round(pvals, 3)))),
size = 10)),
line_gpar = list(list(size = 13),
list(size = 10)),
ids = "inner") +
geom_node_label(aes(label = paste0("Node ", id, ", N = ", nodesize)),
ids = "terminal", nudge_y = -0.0, nudge_x = 0.01) +
geom_node_plot(gglist = list(
geom_bar(aes(x = "", fill = Construction),
position = position_fill(), color = "black"),
theme_minimal(),
scale_fill_grey(start = .4, end = .9),
scale_y_continuous(breaks = c(0, 1)),
xlab(""),
ylab("Probability"),
geom_text(aes(x = "", group = Construction,
label = after_stat(count)),
stat = "count", position = position_fill(), vjust = 1.1)),
shared_axis_labels = TRUE) )# export as pdf (ggsave not working for ggparty objects apparently)
# png("figures/tree_NO.png", width = 10, height = 6, un = "in", res = 300)
# plot(tr_nor)
# dev.off()For the publication, the plot has been slightly revised using Inkscape to prevent the overlap of the egde running from “negative” to the corresponding bar with the “clause_type” node.
# random forest
set.seed(19851003)
for_nor <- party::cforest(Cx ~ Negative+Interrogative+if_clause+Clause_type,
data = nor,
controls = cforest_unbiased(mtry = 2, ntree = 2000))
vi <- party::varimp(for_nor, conditional = TRUE)Plot the variable importance:
(vi_plot_nor <- as.data.frame(vi) %>% rownames_to_column() %>% ggplot(aes(y = fct_reorder(rowname, vi), x = vi)) +
geom_point(size = 5) + xlim(min(vi), max(vi)) +
theme_bw() +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor = element_blank()) + xlab("Conditional variable importance") + ylab("Variable") +
theme(axis.text = element_text(size = 18)) +
theme(axis.title = element_text(size = 18)) +
theme(strip.text = element_text(size = 18)) +
theme(legend.text = element_text(size = 18)) +
theme(legend.title = element_text(size = 18, face = "bold")) +
theme(text = element_text(size = 18)))Model accuracy
# proportion of correct predictions
sum(as.numeric(sapply(1:length(pred.nor), function(i) pred.nor[i] == nor$Cx[i]))) / length(pred.nor)## [1] 0.793072
# combine OANC and BNC2014 data
colnames(oanc)[which(colnames(oanc)=="gramm_person")] <- "Gramm_person"
colnames(oanc)[which(colnames(oanc)=="animacy")] <- "Animacy"
colnames(oanc)[which(colnames(oanc)=="question")] <- "Interrogative"
colnames(bnc)[which(colnames(bnc)=="question")] <- "Interrogative"
colnames(bnc)[which(colnames(bnc)=="subordinate")] <- "Clause_type"
colnames(oanc)[which(colnames(oanc)=="subordinate")] <- "Clause_type"
colnames(oanc)[which(colnames(oanc)=="negation")] <- "Negative"
colnames(bnc)[which(colnames(bnc)=="negation")] <- "Negative"
eng <- rbind(
mutate(select(oanc, Left, Key, Right, Clause_type, Gramm_person, Animacy, Interrogative, Negative, if_clause, if_clause2, cxn, cxn1, Lemma), corpus = "OANC-Spoken"),
mutate(select(bnc, Left, Key, Right, Clause_type, Gramm_person, Animacy, Interrogative, Negative, if_clause, if_clause2, cxn, cxn1, Lemma), corpus = "SpokenBNC2014")
)
# CART tree
eng$Negative <- factor(eng$Negative)
eng$Interrogative <- factor(eng$Interrogative)
eng$if_clause2 <- factor(eng$if_clause2)
eng$Clause_type <- factor(eng$Clause_type)
eng$cxn <- factor(eng$cxn)
eng$cxn1 <- factor(eng$cxn1)
# omit NAs
eng1 <- select(eng, Negative, Interrogative, if_clause2, Clause_type, cxn1, corpus, Lemma)
eng1 <- na.omit(eng1)
# abbreviate labels - obsolete using ggparty
eng1$cxn <- case_when(eng1$cxn1=="will" ~ "w",
eng1$cxn1=="shall" ~ "s",
eng1$cxn1=="won't" ~ "wnt",
eng1$cxn1=="going to" ~ "gt",
eng1$cxn1=="gonna" ~ "gna",
eng1$cxn1=="'ll" ~ "ll")
# add lexeme
eng1$Lexeme <- factor(eng1$Lemma)
# add lexeme with reduced number of levels (bin infrequent lemmas to "other")
eng1$lexeme <- fct_lump_min(eng1$Lexeme, min = 35)
# factor
eng1$Construction <- eng1$cxn1
eng1$cxn <- factor(eng1$cxn, levels = c("w", "ll", "wnt", "s", "gt", "gna"))
# rename if-clause column to increase readability of tree diagram
eng1$if_clause <- eng1$if_clause2
# corpus as factor
eng1$corpus <- factor(eng1$corpus)
# more descriptive: use language variety instead of corpus
eng1$Variety <- ifelse(eng1$corpus == "OANC-Spoken", "AmE", "BrE")
eng1$Variety <- factor(eng1$Variety)
set.seed(1985)
tree_en <- ctree(Construction ~ Negative+Interrogative+if_clause+Clause_type+Variety,
data = eng1)
# plot
# extract p-values
pvals <- unlist(nodeapply(tree_en, ids = nodeids(tree_en), function(n) info_node(n)$p.value))
pvals <- pvals[pvals <.05]
# plot:
( tr_en_plot <- ggparty(tree_en) +
geom_edge() +
geom_edge_label() +
geom_node_label(line_list = list(aes(label = splitvar),
aes(label = paste0("N=", nodesize, ", p",
ifelse(pvals < .001, "<.001", paste0("=", round(pvals, 3)))),
size = 10)),
line_gpar = list(list(size = 13),
list(size = 10)),
ids = "inner") +
geom_node_label(aes(label = paste0("Node ", id, ", N = ", nodesize)),
ids = "terminal", nudge_y = -0.0, nudge_x = 0.01) +
geom_node_plot(gglist = list(
geom_bar(aes(x = "", fill = Construction),
position = position_fill(), color = "black"),
theme_minimal(),
scale_fill_grey(start = .4, end = .9),
scale_y_continuous(breaks = c(0, 1)),
xlab(""),
ylab("Probability"),
geom_text(aes(x = "", group = Construction,
label = after_stat(count)),
stat = "count", position = position_fill(), vjust = 1.1)),
shared_axis_labels = TRUE) )# save as pdf via export pane
#dev.off()
# png("tree_en.png", width = 30, height = 10, un = "in", res = 300)
# plot(tree_en)
# dev.off()
# random forestset.seed(1985)
for_en <- party::cforest(Construction ~ Negative+Interrogative+if_clause+Clause_type+Variety,
data = eng1, controls = cforest_unbiased(mtry = 3, ntree = 2000))
vi_en <- party::varimp(for_en, conditional = TRUE)Visualize variable importance:
(vi_plot_en <- as.data.frame(vi_en) %>% rownames_to_column() %>% ggplot(aes(y = fct_reorder(rowname, vi_en), x = vi_en)) +
geom_point(size = 5) + xlim(min(vi_en), max(vi_en)) +
theme_bw() +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor = element_blank()) + xlab("Conditional variable importance") + ylab("Variable") +
theme(axis.text = element_text(size = 18)) +
theme(axis.title = element_text(size = 18)) +
theme(strip.text = element_text(size = 18)) +
theme(legend.text = element_text(size = 18)) +
theme(legend.title = element_text(size = 18, face = "bold")) +
theme(text = element_text(size = 18)))# both plots
vi_plot_nor + ggtitle("Norwegian") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
plot_spacer() +
vi_plot_en + ggtitle("English") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
plot_layout(widths = c(4,.5,4))Model accuracy (number of correct predictions / number of observations)
## pred.eng
## will 'll shall won't going to gonna
## 0 5873 0 473 2067 357
##
## will 'll shall won't going to gonna
## 1383 3302 133 443 2017 1492
##
## pred.eng will 'll shall won't going to gonna
## will 0 0 0 0 0 0
## 'll 922 3086 35 0 906 924
## shall 0 0 0 0 0 0
## won't 11 18 1 233 42 168
## going to 401 156 16 210 1013 271
## gonna 49 42 81 0 56 129
sum(as.numeric(sapply(1:length(pred.eng), function(i) pred.eng[i] == eng1$cxn1[i]))) / length(pred.eng)## [1] 0.5086659
# first Boruta model for Norwegian
nor$Lexeme <- factor(nor$Lexeme)
set.seed(19551105)
boruta01 <- Boruta(Cx ~ Negative+Interrogative+if_clause+Clause_type+Lexeme,
data = nor)
# decision
getConfirmedFormula(boruta01)## Cx ~ Lexeme + Clause_type + if_clause + Interrogative + Negative
## <environment: 0x7fe232da3928>
# Boruta model with confirmed formula
set.seed(19551105)
boruta02 <- Boruta(Cx ~ Lexeme + Clause_type + if_clause +
Interrogative + Negative,
data = nor)
(boruta_NOR <- as.data.frame(boruta02$ImpHistory) %>% pivot_longer(cols = 1:length(as.data.frame(boruta02$ImpHistory) )) %>% setNames(c("Variable", "Importance")) %>%
mutate(Type = ifelse(str_detect(Variable, "shadow"), "Control", "Predictor")) %>%
mutate(Type = factor(Type),
Variable = factor(Variable)) %>%
ggplot(aes(x = reorder(Variable, Importance, mean), y = Importance, fill = Type)) +
geom_boxplot() +
theme_bw() +
scale_fill_grey(start = .5, end = .9) +
xlab("Variable") + ylab("Importance") + ggtitle("Norwegian") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
theme(panel.grid.major.y = element_line(linewidth = .2, color = "grey75"),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()))# first Boruta model for English
set.seed(2015)
boruta_en01 <- Boruta(cxn ~ Negative+Interrogative+if_clause+Clause_type+Variety+Lexeme,
data = eng1)
# decision
getConfirmedFormula(boruta_en01)## cxn ~ Lexeme + Variety + Clause_type + if_clause + Interrogative +
## Negative
## <environment: 0x7fe1ddc4cef0>
# Boruta model with confirmed formula
boruta_en02 <- Boruta(cxn ~ Lexeme + Variety + Clause_type + if_clause + Interrogative + Negative,
data = eng1)
(boruta_EN <- as.data.frame(boruta_en02$ImpHistory) %>% pivot_longer(cols = 1:length(as.data.frame(boruta_en02$ImpHistory))) %>% setNames(c("Variable", "Importance")) %>%
mutate(Type = ifelse(str_detect(Variable, "shadow"), "Control", "Predictor")) %>%
mutate(Type = factor(Type),
Variable = factor(Variable)) %>%
ggplot(aes(x = reorder(Variable, Importance, mean), y = Importance, fill = Type)) +
geom_boxplot() +
theme_bw() +
scale_fill_grey(start = .5, end = .9) +
xlab("Variable") + ylab("Importance") + ggtitle("English") +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
theme(panel.grid.major.y = element_line(linewidth = .2, color = "grey75"),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()))# both plots
boruta_NOR +
theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
plot_spacer() + boruta_EN +
theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
plot_layout(guides = "collect", widths = c(4,1,4))(Multiple) distinctive collexeme analysis is used to find the lexical items that are most strongly associated with the three variants.
mdca_nor <- nor %>% select(Cx, Lexeme) %>% as.data.frame %>% collex.covar()
mdca_nor %>% filter(SLOT1=="vil")mdca_bnc <- bnc %>% select(cxn1, Lemma) %>% as.data.frame() %>% collex.covar()
mdca_bnc %>% filter(SLOT1=="going to")mdca_oanc <- oanc %>% select(cxn1, Lemma) %>% as.data.frame() %>% collex.covar()
mdca_oanc %>% filter(SLOT1=="going to")The following CWB commands give the lemma lists for SpokenBNC2014 and the spoken component of OANC:
cwb-scan-corpus spokenbnc2014 lemma > "spokenbnc2014_lemmas.tsv"
cwb-scan-corpus oancspoken lemma > "oancspoken_lemmas.tsv"The lemma lists can be used to conduct simple collexeme analyses for the individual constructions.